##################################
### Mass run plant dispatch functions
### Takes using_dfcommon and using_dfsplit. 
### (Removed SLURM)
##################################


require(data.table)
require(lfe)
require(lubridate)
require(Formula)
require(rgdal)
require(rgeos)
require(sp)
require(sf)

WORK = file.path('Z:/user/ajk41/Solar siting/Scripts/JAERE Code Repository/Data JAERE')
WORK.OUT = file.path(WORK, 'intermediate_output')
dir.create(WORK.OUT)



#################### Functions
#### Some useful functions:
jGetNercsReg<-function(dat){
  if(class(dat)=='data.frame'|class(dat)=='data.table') {
    da = unique(dat$NERC) } else {
      da = unique(dat)}
  if(NROW(da)>1) stop('This is not a unique NERC region')
  
  EAST = c("FRCC", "MRO",  "NPCC", "RFC",  "SERC", "SPP")
  
  if(da=='WECC') return('WECC') 
  if(da=='TRE')  return('TRE')
  if(da%in%EAST) return(EAST)
  stop('NERC not recognized')
}


## plant-by-plant emissions and generation from XXX
df.split = readRDS(file.path(WORK.OUT, 'using_dfsplit.rds'))

## common hourly demand (aggregated to subregions) from YYY
df.common =  readRDS(file.path(WORK, 'using_dfcommon.rds')) 

setnames(df.common,old='hour', new='MW')
hourly.sum = df.common[,j = list(MW.sum = sum(MW, na.rm=T)), by = c('INT','dh_e')]
df.common[,PCA:=paste0('PCA_', substr(INT, 1, 1),NERC.SUBREGION)]


df.common = dcast(data = df.common[,.(dh_e, PCA, MW)],  dh_e ~ PCA, value.var = 'MW', fun.aggregate = sum, na.rm=T)
df.common[,c('mo','hour','year'):=list(month(dh_e), hour(dh_e), year(dh_e))]
PCAs = grep(pattern='PCA_', names(df.common), value=T)                                            

########################

#### Get generator-by-generator listing of "complete" mo-years:

dst.bad.dates = mdy('03/11/2007','11/04/2007',
                    '03/09/2008','11/02/2008',
                    '03/08/2009','11/01/2009',
                    '03/14/2010','11/07/2010',  #--> dropping all days with DST, since we estimate for weekday/weekend and month
                    '03/13/2011','11/06/2011',
                    '03/11/2012','11/04/2012',
                    '03/10/2013','11/03/2013',
                    '03/09/2014','11/02/2014',
                    '03/08/2015','11/01/2015', 
                    '03/13/2016','11/06/2016')

df.split = lapply(df.split, function(z){
  return(z[!dh_e %in% dst.bad.dates,])
})

df.common = df.common[!dh_e %in% dst.bad.dates,]



###
# To estimate 1,527 separate regressions (one for each plant), we simplify by using
# a qr decomposition of the RHS (hourly demand from each PCA + fixed effects)
# Essentially, the Gramm matrix (X'X)^{-1}X'
# To do this, we must categorize each plant in df.split 
# based on the complete months of data. Some plants are new or are missing some data, which means
# the qr decomposition for the RHS on one plant with full data will not work for another plant with a subset of the data.

# if any hours are missing from a plant's data for a given month, that entire month is dropped. Only complete months are used.

tot = unique(rbindlist( lapply(df.split, function(z) data.frame(dh_e = z$dh_e))))[-(1:5),] #hours 1, 2, 3, and 4 are missing most PCAs in df.common
tot = tot[dh_e %in% df.common[,dh_e],]

# get count of hours in year year-month
tot.hr =  tot %>%
  mutate(year = year(dh_e), month=month(dh_e), day = day(dh_e)) %>% 
  group_by(year, month, day) %>% summarize(NumHours = length(dh_e)) %>%
  group_by(year, month) %>% summarize(NumHours = sum(NumHours, na.rm=F))

# for each df.split(each ORISPL)
df.split.useMY = lapply(df.split, function(x){
    x = x[dh_e %in% tot$dh_e,]              # Select just the dh_e that are in tot (and thus are in df.common)
    x[!is.na(SO2_MASS) & !is.na(NOX_MASS) & !is.na(GLOAD_MASS),.(dh_e)] %>%  # drop those with missing output
    mutate(year = year(dh_e), month=month(dh_e), day=day(dh_e)) %>%
    group_by(year, month, day) %>% summarize(ObsHours = length(dh_e)) %>% # sum up the observed hours
    group_by(year, month) %>% summarize(ObsHours = sum(ObsHours, na.rm=T)) %>%
    left_join(tot.hr) %>% dplyr::filter(ObsHours==NumHours) %>%               # keep just the months that have the right number of hours.
    dplyr::select(year, month) %>% mutate(UseMY = as.logical(T)) %>% ungroup()
}) # end df.split.useMY


for(i in 1:length(df.split)){ # probably could be an mapply(...) call...
  x = setDT(df.split[[i]][dh_e %in% tot$dh_e,][!is.na(SO2_MASS) & !is.na(NOX_MASS) & !is.na(GLOAD_MASS),] %>%
    mutate(year = year(dh_e), month=month(dh_e), day=day(dh_e)) %>%
    group_by(year, month, day) %>% mutate(ym = paste(year, month, sep='-')) %>%
    ungroup() %>% dplyr::select(-year,-month, -day))
    
  y = df.split.useMY[[i]] %>% mutate(ym = paste(year, month, sep='-'))
  
  df.split[[i]] = x[ym %in% y$ym,]}  # each element of df.split is now just the year-months where that df.split yearmonth is equal to the exact length of the aggregated yearmonths



## Handle df.common, the common load:
df.common2 = df.common[dh_e %in% tot$dh_e,] %>%
  mutate(year = year(dh_e), month=month(dh_e), day=day(dh_e)) %>%
  group_by(year, month, day) %>% summarize(ObsHours = length(dh_e)) %>%
  group_by(year, month) %>% summarize(ObsHours = sum(ObsHours, na.rm=T)) %>%
  left_join(tot.hr) %>% dplyr::filter(ObsHours==NumHours) %>%
  dplyr::select(year, month) %>% mutate(UseMY = as.logical(T)) # check for 2010-11


df.common = df.common[dh_e %in% tot$dh_e,] %>% 
  mutate(year = year(dh_e), month=month(dh_e), day=day(dh_e)) %>%
  left_join(df.common2, by=c("year" = "year", "mo"="month")) %>%
  dplyr::filter(!is.na(UseMY)) %>% 
  mutate(ym = paste0(year, '-', mo))  %>%
  dplyr::select(-UseMY, -year)

rm(df.common2)
setDT(df.common)

## Check results--# NA errors will appear here
df.common[,c('NumNA','TotEn'):=list(rowSums(is.na(df.common[,PCAs, with=F])), 
                                    rowSums(df.common[,PCAs, with=F], na.rm=T))]
df.common[,Change:= (NumNA!=data.table::shift(NumNA, 1, type = 'lag', fill='F') | NumNA!=data.table::shift(NumNA, 1, type = 'lead', fill='F'))]
# View(df.common[Change==T,])  # All that are left are likely actual change dates when multiple PCAs went to 1 ISO.
                               # This should be absorbed by the month-of-sample:Generator FE's.
                               # as of 3-4, this is empty. 3-29, new PCAC; still empty.

## Replace NA's with zero (will fall out in demeaning)
df.common[is.na(df.common)] = 0  # setting the = to zero lets demeanlist demean them (to 0; no variation) which will drop them out of regression.
df.common[,weekday:=!weekdays(dh_e)%in%c('Saturday','Sunday')]
df.common[,Change:=NULL]

## Remove PCA's with no variance:
has.var = apply(df.common[,PCAs, with=F], MAR=2, var, na.rm=T)>0
PCAs = names(has.var)[which(has.var==T)]



WEST.PCA = grep('_W'   , names(df.common), value=T)
EAST.PCA = grep('_W|_T', names(df.common), value=T, invert = T)
TRE.PCA = grep('_T'    , names(df.common), value=T)

df.common.intx = list(EAST = df.common[,c('dh_e', 'ym', 'mo', 'hour', 'weekday', EAST.PCA[which(EAST.PCA %in% PCAs)]), with=F],
                      TRE  = df.common[,c('dh_e', 'ym', 'mo', 'hour', 'weekday',  TRE.PCA[which(TRE.PCA  %in% PCAs)]), with=F],
                      WEST = df.common[,c('dh_e', 'ym', 'mo', 'hour', 'weekday', WEST.PCA[which(WEST.PCA %in% PCAs)]), with=F])



df.split.intx = lapply(df.split, function(x){
  NERC = x$NERC[1]
  return(ifelse(NERC=='WECC', 'WEST',
                ifelse(NERC=='TRE', 'TRE', 'EAST')))
})

df.split.useMYS = split(df.split.useMY, unlist(df.split.intx)) # all the unique Month-Year combos, org. by intx

df.split.useMYS = lapply(df.split.useMYS, function(intx){
  temp = data.frame(ym = as.factor(unlist(lapply(intx, function(ORISPL){
          paste0(ORISPL$year,'-',ORISPL$month, collapse='--')  
          }))))
  temp$ymn = as.numeric(as.factor(temp$ym))
  return(temp)
})

for(intx in names(df.split.useMYS)){
df.split.useMYS[[intx]][,'ym'] = paste0(intx,'---',df.split.useMYS[[intx]][,'ym'])
df.split.useMYS[[intx]][,'ymn'] = paste0(intx,'---',df.split.useMYS[[intx]][,'ymn'])
}

To.Make = unique(rbindlist(lapply(df.split.useMYS, function(x){
  return(unique(x[,c('ymn','ym')]))
})))

jDecompose <- function(x){
  if(class(x)[1]%in%c('data.frame','data.table')) x = x$ym
  y = strsplit(x = x, split='---')
  z = unlist(strsplit(x=y[[1]][2], split='--'))
  return(data.frame(intx = y[[1]][1],
                  ym = z, stringsAsFactors=F))
} # takes the long ym value, parses out the intx, year-mo for the ym



## Diagnostic: check INTX-level hourly sum against hourly.sum, the same measure created before proc. data.
## checking to make sure no load was lost.
hourly.sum.after = lapply(df.common.intx, function(x){
  # df.common %>% group_by(INT, dh_e) %>% summarize(MW.sum = sum(MW, na.rm=T))  # hold onto this until end
  data.frame(dh_e = x$dh_e, apply(x[,grep(pattern='PCA_', names(x), value=T), with=F], MAR=1, sum, na.rm=T))
})

for(i in names(hourly.sum.after)){
  nm = names(hourly.sum.after[[i]]) 
  nm[2] = paste0(i,"_MW.sum")
  names(hourly.sum.after[[i]]) = nm
}

hsa = merge(x = hourly.sum.after[[1]], y = hourly.sum.after[[2]])
hsa = merge(hsa, hourly.sum.after[[3]])

hs = hourly.sum %>% spread(INT, MW.sum) %>% dplyr::select(dh_e, EAST.actual = EAST, TRE.actual = TRE, WEST.actual = WEST)
compare = data.table(merge(hs, hsa, all=F))  # "actual" is from beginning; the "hsa" is "after" and should be equal.
compare[,c('Ediff','Wdiff','Tdiff'):=list(EAST.actual - EAST_MW.sum, WEST.actual - WEST_MW.sum, TRE.actual - TRE_MW.sum)]
sum(is.na(compare$Ediff))
mean(compare$Ediff^2, na.rm=T) # pretty much zero
## End diagnostic; looks like the process conserves all MW load it starts with.


## Make a holding list (will hold the final (X'X)-1 matrix)
To.Make.list = split(To.Make, 1:NROW(To.Make))
names(To.Make.list) = To.Make$ymn


## Export for estimation (gpu on SLURM) ##
save(list = c('To.Make.list','df.common.intx','df.split','df.split.useMYS','jDecompose'), file = file.path(WORK.OUT, 'usingWorkspace.Rdata'))

#### End here #######





